if(!require(pacman))
install.packages("pacman")
pacman::p_load(tidyverse,
scales,
devtools,
here,
class,
plotly,
dplyr,
caret,
BiocManager,
smotefamily,
pROC,
mclust,
factoextra,
cluster,
dbscan,
arules,
arulesViz,
themis,
recipes,
e1071,
GGally
)Data Mining Final Project
# Reading the dataset
hepatitis <- read_csv("HepatitisCdata.csv")# Handling the NA values by replacing them by the median of each column
hepatitis <- hepatitis %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Verifying that all NA values have been handled
colSums(is.na(hepatitis)) ...1 Category Age Sex ALB ALP ALT AST
0 0 0 0 0 0 0 0
BIL CHE CHOL CREA GGT PROT
0 0 0 0 0 0
write.csv(hepatitis, "HepatitisC_Cleaned.csv", row.names = FALSE)# EDA for the dataset
# Defining a colorblind-friendly palette
color_palette <- c("#440154FF", "#3B528BFF", "#21908CFF", "#5DC863FF", "#FDE725FF")
# Creating an enhanced interactive scatterplot
p <- plot_ly(
data = hepatitis,
x = ~AST,
y = ~ALT,
type = 'scatter',
mode = 'markers',
color = ~as.factor(Category),
colors = color_palette,
marker = list(
size = 8,
opacity = 0.8
),
text = ~paste(
"Category:", case_when(
Category == 0 ~ "Blood Donor",
Category == 1 ~ "Suspect Blood Donor",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis",
TRUE ~ as.character(Category)
),
"<br>AST:", AST,
"<br>ALT:", ALT,
"<br>Age:", Age
)
) %>%
layout(
title = list(
text = "Interactive Scatterplot: AST vs ALT by Category",
font = list(size = 18)
),
xaxis = list(
title = "AST (Aspartate Transaminase)",
titlefont = list(size = 14),
zeroline = FALSE,
showgrid = FALSE
),
yaxis = list(
title = "ALT (Alanine Transaminase)",
titlefont = list(size = 14),
zeroline = FALSE,
showgrid = FALSE
),
legend = list(
title = list(text = "Category"),
font = list(size = 12),
orientation = "v",
x = 1.1,
y = 0.5,
xanchor = "left"
),
plot_bgcolor = "#FFFFFF",
paper_bgcolor = "#FFFFFF"
)
# Updating legend labels after plot creation
p <- p %>% layout(
legend = list(
title = list(text = "Category"),
font = list(size = 12),
orientation = "v",
x = 1.1,
y = 0.5,
xanchor = "left",
traceorder = "normal",
itemsizing = "constant",
labels = list(
"0" = "Blood Donor",
"1" = "Suspect Blood Donor",
"2" = "Hepatitis",
"3" = "Fibrosis",
"4" = "Cirrhosis"
)
)
)
# Displaying the plot
pClassification Analysis for the dataset Hepatitis-C - Logistic Regression
# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")
# Convert 'Category' into a binary variable (e.g., "Blood Donor" vs. others)
hepatitis <- hepatitis %>%
mutate(BinaryCategory = ifelse(Category == "0=Blood Donor", 1, 0))
# Encode 'Sex' as a numeric variable (e.g., "m" -> 1, "f" -> 0)
hepatitis <- hepatitis %>%
mutate(Sex = ifelse(Sex == "m", 1, 0))
# Exclude unnecessary columns
hepatitis <- hepatitis %>%
select(-...1, -Category) # Remove index and original 'Category'
# Ensure the target variable is a factor
hepatitis$BinaryCategory <- as.factor(hepatitis$BinaryCategory)
# Split the dataset into training and testing sets
set.seed(123) # For reproducibility
train_index <- createDataPartition(hepatitis$BinaryCategory, p = 0.8, list = FALSE)
train_data <- hepatitis[train_index, ]
test_data <- hepatitis[-train_index, ]
# Create a recipe for SMOTE
smote_recipe <- recipe(BinaryCategory ~ ., data = train_data) %>%
step_smote(BinaryCategory, over_ratio = 1) # Balance classes to a 1:1 ratio
# Prepare the SMOTE dataset
smote_data <- prep(smote_recipe) %>%
bake(new_data = NULL)
# Check the class distribution after SMOTE
print("Class distribution after applying SMOTE:")[1] "Class distribution after applying SMOTE:"
print(table(smote_data$BinaryCategory))
0 1
427 427
# Train logistic regression on the SMOTE-balanced dataset
smote_logistic_model <- glm(BinaryCategory ~ ., data = smote_data, family = "binomial")
# Summarize the model
print("Logistic Regression Model Summary:")[1] "Logistic Regression Model Summary:"
print(summary(smote_logistic_model))
Call:
glm(formula = BinaryCategory ~ ., family = "binomial", data = smote_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.246494 3.005821 2.078 0.037697 *
Age 0.055334 0.019529 2.833 0.004605 **
Sex 1.277596 0.462124 2.765 0.005699 **
ALB 0.170787 0.051974 3.286 0.001016 **
ALP 0.091808 0.011753 7.812 5.65e-15 ***
ALT 0.033121 0.007777 4.259 2.06e-05 ***
AST -0.163989 0.021844 -7.507 6.04e-14 ***
BIL -0.084453 0.025373 -3.328 0.000873 ***
CHE -0.239305 0.104772 -2.284 0.022368 *
CHOL 0.646587 0.210365 3.074 0.002115 **
CREA -0.024328 0.004495 -5.412 6.22e-08 ***
GGT -0.038332 0.006404 -5.986 2.15e-09 ***
PROT -0.182912 0.040763 -4.487 7.22e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1183.90 on 853 degrees of freedom
Residual deviance: 251.74 on 841 degrees of freedom
AIC: 277.74
Number of Fisher Scoring iterations: 9
# Predict on the test set
smote_predictions <- predict(smote_logistic_model, newdata = test_data, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
smote_predicted_classes <- ifelse(smote_predictions > 0.5, 1, 0)
# Evaluate the model
smote_confusion_matrix <- confusionMatrix(as.factor(smote_predicted_classes), as.factor(test_data$BinaryCategory))
print("Confusion Matrix after applying SMOTE:")[1] "Confusion Matrix after applying SMOTE:"
print(smote_confusion_matrix)Confusion Matrix and Statistics
Reference
Prediction 0 1
0 14 7
1 2 99
Accuracy : 0.9262
95% CI : (0.8646, 0.9657)
No Information Rate : 0.8689
P-Value [Acc > NIR] : 0.03364
Kappa : 0.7142
Mcnemar's Test P-Value : 0.18242
Sensitivity : 0.8750
Specificity : 0.9340
Pos Pred Value : 0.6667
Neg Pred Value : 0.9802
Prevalence : 0.1311
Detection Rate : 0.1148
Detection Prevalence : 0.1721
Balanced Accuracy : 0.9045
'Positive' Class : 0
# Plot ROC curve for the SMOTE model
smote_roc_curve <- roc(as.numeric(test_data$BinaryCategory), smote_predictions)
plot(smote_roc_curve, col = "red", main = "ROC Curve for Logistic Regression (SMOTE)")Probability Distribution Visualization for Logistic Regression
test_data$Predicted_Probabilities <- smote_predictions
ggplot(test_data, aes(x = Predicted_Probabilities, fill = BinaryCategory)) +
geom_density(alpha = 0.5) +
labs(title = "Probability Distribution for Logistic Regression",
x = "Predicted Probability",
y = "Density",
fill = "Actual Class") +
theme_minimal()GMM Clustering
# Step 1: Load the dataset
# Assuming the data has been preprocessed (scaled, PCA applied, and features weighted)
hepatitis_data <- read.csv("HepatitisC_Cleaned.csv")
# Select numeric columns for clustering
numeric_data <- hepatitis_data[sapply(hepatitis_data, is.numeric)]
scaled_data <- scale(numeric_data)
# Perform PCA for dimensionality reduction
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- pca$x[, 1:2] # Retain the first two principal components
# Step 2: Apply Gaussian Mixture Models (GMM)
set.seed(123) # Ensure reproducibility
gmm_model <- Mclust(pca_data)
# Optimal number of clusters
cat("Optimal Number of Clusters (GMM):", gmm_model$G, "\n")Optimal Number of Clusters (GMM): 2
# Step 3: Evaluate Clustering Quality
# Silhouette Score
silhouette_scores <- silhouette(gmm_model$classification, dist(pca_data))
avg_silhouette <- mean(silhouette_scores[, 3])
cat("Average Silhouette Score (GMM):", round(avg_silhouette, 2), "\n")Average Silhouette Score (GMM): 0.62
# Adjusted Rand Index (ARI)
if ("Category" %in% colnames(hepatitis_data)) {
actual_labels <- as.numeric(factor(hepatitis_data$Category)) # Replace 'Category' with your actual label column
ari <- adjustedRandIndex(actual_labels, gmm_model$classification)
cat("Adjusted Rand Index (ARI):", round(ari, 2), "\n")
} else {
cat("No actual labels provided for ARI calculation.\n")
}Adjusted Rand Index (ARI): 0.63
# Step 5: Add Cluster Assignments to Dataset
hepatitis_data$Cluster <- as.factor(gmm_model$classification)
# Step 6: Analyze Clusters
# Summary statistics for each cluster
cluster_summary <- hepatitis_data %>%
group_by(Cluster) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE))
# Enhanced GMM Clustering Visualization with Labels
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(gmm_model$classification))) +
geom_point(size = 3, alpha = 0.7) + # Data points
stat_ellipse(aes(fill = as.factor(gmm_model$classification)), alpha = 0.2, geom = "polygon") + # Ellipses
scale_color_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
scale_fill_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
labs(
title = "Gaussian Mixture Model Clustering",
x = "Principal Component 1",
y = "Principal Component 2"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "right",
legend.text = element_text(size = 10),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10)
)DBSCAN Clustering
# Scale the data
scaled_data <- scale(pca_data)
# Apply DBSCAN with eps = 1.0 and minPts = 5 (rule of thumb: dimensions + 1)
dbscan_model <- dbscan(scaled_data, eps = 1, minPts = 5)
# Calculate cluster sizes
cluster_sizes <- table(dbscan_model$cluster)
# Calculate the percentage of noise
noise_percentage <- sum(dbscan_model$cluster == 0) / nrow(scaled_data) * 100
# Compute silhouette scores
silhouette_scores <- silhouette(dbscan_model$cluster, dist(scaled_data))
avg_silhouette <- mean(silhouette_scores[, 3])
# Visualize DBSCAN clusters
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(dbscan_model$cluster))) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(
values = c("#999999", "#4CAF50", "#FF5722"),
name = "Cluster",
labels = c("Noise", "Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
labs(
title = "DBSCAN Clustering Results",
x = "Principal Component 1",
y = "Principal Component 2"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "right",
legend.text = element_text(size = 10),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10)
)cat("DBSCAN Results Summary:\n")DBSCAN Results Summary:
cat("Number of Clusters:", length(unique(dbscan_model$cluster)) - 1, "\n") # Exclude noise clusterNumber of Clusters: 2
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")Noise Percentage: 2.11 %
cat("Average Silhouette Score:", round(avg_silhouette, 2), "\n")Average Silhouette Score: 0.65
cat("\nAdditional Insights:\n")
Additional Insights:
cat("- Noise (Cluster 0) contains", cluster_sizes[1], "points.\n")- Noise (Cluster 0) contains 13 points.
cat("- Largest cluster size:", max(cluster_sizes[-1]), "points (excluding noise).\n")- Largest cluster size: 598 points (excluding noise).
cat("- Smallest cluster size:", min(cluster_sizes[-1]), "points (excluding noise).\n")- Smallest cluster size: 4 points (excluding noise).
Classification Analysis for the dataset Hepatitis-C - SVM
# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")
# Preprocessing
# Encode 'Sex' as a factor
hepatitis$Sex <- as.factor(hepatitis$Sex)
# Convert 'Category' to a factor for multi-class classification
hepatitis$Category <- as.factor(hepatitis$Category)
# Split the data into training and testing sets
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(hepatitis$Category, p = 0.8, list = FALSE)
trainData <- hepatitis[trainIndex, ]
testData <- hepatitis[-trainIndex, ]
# Identify numeric features
numeric_features <- names(which(sapply(trainData, is.numeric)))
# Scale numeric features
scaled_train <- trainData
scaled_test <- testData
for (col in numeric_features) {
# Get scaling parameters from the training set
col_mean <- mean(trainData[[col]], na.rm = TRUE)
col_sd <- sd(trainData[[col]], na.rm = TRUE)
# Scale training and test data
scaled_train[[col]] <- (trainData[[col]] - col_mean) / col_sd
scaled_test[[col]] <- (testData[[col]] - col_mean) / col_sd
}
# Train the SVM model with a radial basis function kernel
svm_model <- svm(Category ~ ., data = scaled_train, kernel = "radial", cost = 1, gamma = 0.1)
# Predict on the test set
svm_predictions <- predict(svm_model, newdata = scaled_test)
# Evaluate the model
conf_matrix <- confusionMatrix(svm_predictions, scaled_test$Category)
print("Confusion Matrix for SVM:")[1] "Confusion Matrix for SVM:"
print(conf_matrix)Confusion Matrix and Statistics
Reference
Prediction 0=Blood Donor 0s=suspect Blood Donor 1=Hepatitis
0=Blood Donor 106 0 0
0s=suspect Blood Donor 0 0 0
1=Hepatitis 0 0 3
2=Fibrosis 0 0 0
3=Cirrhosis 0 1 1
Reference
Prediction 2=Fibrosis 3=Cirrhosis
0=Blood Donor 0 0
0s=suspect Blood Donor 0 0
1=Hepatitis 0 0
2=Fibrosis 3 1
3=Cirrhosis 1 5
Overall Statistics
Accuracy : 0.9669
95% CI : (0.9175, 0.9909)
No Information Rate : 0.876
P-Value [Acc > NIR] : 0.0004865
Kappa : 0.8546
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0=Blood Donor Class: 0s=suspect Blood Donor
Sensitivity 1.000 0.000000
Specificity 1.000 1.000000
Pos Pred Value 1.000 NaN
Neg Pred Value 1.000 0.991736
Prevalence 0.876 0.008264
Detection Rate 0.876 0.000000
Detection Prevalence 0.876 0.000000
Balanced Accuracy 1.000 0.500000
Class: 1=Hepatitis Class: 2=Fibrosis Class: 3=Cirrhosis
Sensitivity 0.75000 0.75000 0.83333
Specificity 1.00000 0.99145 0.97391
Pos Pred Value 1.00000 0.75000 0.62500
Neg Pred Value 0.99153 0.99145 0.99115
Prevalence 0.03306 0.03306 0.04959
Detection Rate 0.02479 0.02479 0.04132
Detection Prevalence 0.02479 0.03306 0.06612
Balanced Accuracy 0.87500 0.87073 0.90362
# Specify the features to include in the plot
selected_features <- c("Age", "ALB", "ALP", "ALT", "AST", "BIL")
# Filter the dataset for specific categories
filtered_data <- scaled_train %>%
filter(Category %in% c("1=Hepatitis", "2=Fibrosis", "3=Cirrhosis")) # Assuming "1" = Hepatitis, "2" = Fibrosis, "3" = Cirrhosis
# Parallel coordinates plot for selected features and categories
ggparcoord(data = filtered_data,
columns = which(names(filtered_data) %in% selected_features),
groupColumn = "Category",
scale = "std", alphaLines = 0.5) +
labs(title = "Parallel Coordinates Plot for Hepatitis, Fibrosis, and Cirrhosis",
x = "Features", y = "Scaled Values") +
theme_minimal(base_size = 14)Association rule using Apriori Algorithm
# 1. Data Preparation
hepatitis_categorized <- hepatitis %>%
mutate(
Age_Group = case_when(
Age < 35 ~ "Young",
Age < 50 ~ "Middle",
Age < 65 ~ "Senior",
TRUE ~ "Elderly"
),
ALT_Level = case_when(
ALT < 40 ~ "Normal",
ALT < 80 ~ "Mild_Elevation",
ALT < 200 ~ "Moderate_Elevation",
TRUE ~ "Severe_Elevation"
),
AST_Level = case_when(
AST < 40 ~ "Normal",
AST < 80 ~ "Mild_Elevation",
AST < 200 ~ "Moderate_Elevation",
TRUE ~ "Severe_Elevation"
),
Disease_Status = case_when(
Category == 0 ~ "Blood_Donor",
Category == 1 ~ "Suspect",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis"
)
)
# 2. Create transactions
hepatitis_trans <-
as(data.frame(select(hepatitis_categorized,
Age_Group, ALT_Level, AST_Level, Disease_Status)),
"transactions")
# 3. Apply Apriori algorithm
rules <- apriori(hepatitis_trans,
parameter = list(
support = 0.03,
confidence = 0.5,
minlen = 2,
maxlen = 4
))Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.5 0.1 1 none FALSE TRUE 5 0.03 2
maxlen target ext
4 rules TRUE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 18
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[12 item(s), 615 transaction(s)] done [0.00s].
sorting and recoding items ... [9 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [25 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# 4. Process rules
rules_pruned <-
rules[!is.redundant(rules)]
rules_sorted <-
sort(rules_pruned, by = "lift", decreasing = TRUE)
# 5. Create rules dataframe
rules_df <- data.frame(
lhs = labels(lhs(rules_sorted)),
rhs = labels(rhs(rules_sorted)),
support = quality(rules_sorted)$support,
confidence = quality(rules_sorted)$confidence,
lift = quality(rules_sorted)$lift
)
# 6. Visualization
# Scatter Plot
scatter_plot <- plot_ly(rules_df,
x = ~support,
y = ~confidence,
color = ~lift,
type = "scatter",
mode = "markers",
marker = list(size = 10),
text = ~paste("Rule:", lhs, "=>", rhs)) %>%
layout(
title = "Apriori Algorithm",
xaxis = list(title = "Support"),
yaxis = list(title = "Confidence"),
colorbar = list(title = "Lift")
)
# 7. Print Quality Assessment
cat("\nAssociation Rule Analysis Summary:")
Association Rule Analysis Summary:
cat("\nNumber of Rules:", nrow(rules_df))
Number of Rules: 21
cat("\nAverage Confidence (Accuracy):", round(mean(rules_df$confidence), 4))
Average Confidence (Accuracy): 0.7977
cat("\nAverage Lift:", round(mean(rules_df$lift), 4))
Average Lift: 1.0168
cat("\nAverage Support:", round(mean(rules_df$support), 4))
Average Support: 0.2092
# Print top 10 rules
cat("\n\nTop 10 Rules by Lift:\n")
Top 10 Rules by Lift:
head(rules_df[order(-rules_df$lift),
c("lhs", "rhs", "support", "confidence", "lift")], 10) lhs rhs support
1 {ALT_Level=Mild_Elevation,AST_Level=Normal} {Age_Group=Middle} 0.05691057
2 {ALT_Level=Mild_Elevation} {Age_Group=Middle} 0.07154472
3 {Age_Group=Young,AST_Level=Normal} {ALT_Level=Normal} 0.08780488
4 {AST_Level=Mild_Elevation} {Age_Group=Middle} 0.05040650
5 {Age_Group=Elderly,AST_Level=Normal} {ALT_Level=Normal} 0.03577236
6 {Age_Group=Young,ALT_Level=Normal} {AST_Level=Normal} 0.08780488
7 {Age_Group=Middle,ALT_Level=Normal} {AST_Level=Normal} 0.35934959
8 {Age_Group=Senior,AST_Level=Normal} {ALT_Level=Normal} 0.26829268
9 {ALT_Level=Normal} {AST_Level=Normal} 0.75121951
10 {AST_Level=Normal} {ALT_Level=Normal} 0.75121951
confidence lift
1 0.6034483 1.258036
2 0.5789474 1.206958
3 0.9473684 1.120445
4 0.5254237 1.095375
5 0.9166667 1.084135
6 0.9152542 1.082464
7 0.9132231 1.080062
8 0.9016393 1.066362
9 0.8884615 1.050777
10 0.8884615 1.050777
# Display plots
scatter_plotCHARM Algorithm
# 1. Data Preprocessing
hepatitis_categorized <- hepatitis %>%
mutate(
Age_Group = case_when(
Age < 35 ~ "Young",
Age < 50 ~ "Middle",
Age < 65 ~ "Senior",
TRUE ~ "Elderly"
),
ALT_Level = case_when(
ALT < 40 ~ "ALT_Normal",
ALT < 80 ~ "ALT_Mild_Elevation",
ALT < 200 ~ "ALT_Moderate_Elevation",
TRUE ~ "ALT_Severe_Elevation"
),
AST_Level = case_when(
AST < 40 ~ " AST_Normal",
AST < 80 ~ " AST_Mild_Elevation",
AST < 200 ~ " AST_Moderate_Elevation",
TRUE ~ " AST_Severe_Elevation"
),
Disease_Status = case_when(
Category == 0 ~ "Blood_Donor",
Category == 1 ~ "Suspect",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis"
)
)
# 2. CHARM Implementation
charm_association_rules <- function(data, min_support = 0.05, min_confidence = 0.5) {
# Convert to transaction format
trans_matrix <- as.matrix(data)
n_trans <- nrow(trans_matrix)
# Find unique items
items <- unique(as.character(trans_matrix))
cat("Initial items:", length(items), "\n")
# Create vertical database (tid-lists)
tid_lists <- list()
frequent_items <- character()
# Find frequent single items
for(item in items) {
tids <- which(apply(trans_matrix, 1, function(x) item %in% x))
support <- length(tids)/n_trans
if(support >= min_support) {
tid_lists[[item]] <- tids
frequent_items <- c(frequent_items, item)
}
}
cat("Frequent items:", length(frequent_items), "\n")
# Store closed itemsets
closed_itemsets <- list()
# Find closed itemsets
find_closed_itemsets <- function(prefix = character(), prefix_tids = NULL,
remaining_items = frequent_items) {
for(i in seq_along(remaining_items)) {
item <- remaining_items[i]
new_tids <- if(is.null(prefix_tids)) tid_lists[[item]]
else intersect(prefix_tids, tid_lists[[item]])
support <- length(new_tids)/n_trans
if(support >= min_support) {
new_itemset <- c(prefix, item)
# Check if closed
is_closed <- TRUE
for(existing in names(closed_itemsets)) {
if(all(new_itemset %in% strsplit(existing, ",")[[1]]) &&
closed_itemsets[[existing]]$support == support) {
is_closed <- FALSE
break
}
}
if(is_closed) {
closed_itemsets[[paste(new_itemset, collapse=",")]] <<- list(
items = new_itemset,
support = support,
tids = new_tids
)
}
# Recursive call with remaining items
if(i < length(remaining_items)) {
find_closed_itemsets(new_itemset, new_tids,
remaining_items[(i+1):length(remaining_items)])
}
}
}
}
# Generate closed itemsets
cat("Generating closed itemsets...\n")
find_closed_itemsets()
cat("Found", length(closed_itemsets), "closed itemsets\n")
# Generate rules from closed itemsets
rules <- data.frame(
lhs = character(),
rhs = character(),
support = numeric(),
confidence = numeric(),
lift = numeric(),
stringsAsFactors = FALSE
)
# Generate rules
cat("Generating association rules...\n")
for(itemset_key in names(closed_itemsets)) {
itemset <- closed_itemsets[[itemset_key]]
if(length(itemset$items) >= 2) {
for(i in 1:(length(itemset$items)-1)) {
combs <- combn(itemset$items, i, simplify = FALSE)
for(lhs in combs) {
rhs <- setdiff(itemset$items, lhs)
# Calculate confidence
lhs_support <- max(sapply(names(closed_itemsets), function(key) {
if(all(lhs %in% closed_itemsets[[key]]$items))
return(closed_itemsets[[key]]$support)
return(0)
}))
if(lhs_support > 0) {
confidence <- itemset$support / lhs_support
if(confidence >= min_confidence) {
# Calculate lift
rhs_support <- max(sapply(names(closed_itemsets), function(key) {
if(all(rhs %in% closed_itemsets[[key]]$items))
return(closed_itemsets[[key]]$support)
return(0)
}))
lift <- confidence / rhs_support
rules[nrow(rules) + 1,] <- list(
lhs = paste(lhs, collapse=", "),
rhs = paste(rhs, collapse=", "),
support = itemset$support,
confidence = confidence,
lift = lift
)
}
}
}
}
}
}
return(rules)
}
# 3. Apply CHARM to dataset
selected_data <- select(hepatitis_categorized,
Age_Group, ALT_Level, AST_Level, Disease_Status)
rules <- charm_association_rules(selected_data, min_support = 0.05, min_confidence = 0.5)Initial items: 13
Frequent items: 10
Generating closed itemsets...
Found 24 closed itemsets
Generating association rules...
# 4. Visualize results
if(nrow(rules) > 0) {
# Scatter plot
scatter_plot <- plot_ly(rules,
x = ~confidence,
y = ~lift,
color = ~support,
type = "scatter",
mode = "markers",
text = ~paste("Rule:", lhs, "=>", rhs,
"<br>Support:", round(support, 4),
"<br>Confidence:", round(confidence, 4),
"<br>Lift:", round(lift, 4))
) %>%
layout(
title = "CHARM Algotrithm",
xaxis = list(title = "Confidence"),
yaxis = list(title = "Lift"),
colorbar = list(title = "Support")
)
# Print statistics
cat("\nAssociation Rule Analysis Results:\n")
cat("Total rules found:", nrow(rules), "\n")
cat("Average confidence:", round(mean(rules$confidence), 4), "\n")
cat("Average lift:", round(mean(rules$lift), 4), "\n")
cat("Average support:", round(mean(rules$support), 4), "\n")
# Print top rules
cat("\nTop 10 Rules by Lift:\n")
print(head(rules[order(-rules$lift), ], 10))
# Save results
write.csv(rules, "charm_rules.csv", row.names = FALSE)
} else {
cat("No rules found. Try adjusting the minimum support or confidence thresholds.\n")
}
Association Rule Analysis Results:
Total rules found: 23
Average confidence: 0.8
Average lift: 1.0279
Average support: 0.2454
Top 10 Rules by Lift:
lhs rhs support
12 ALT_Mild_Elevation, AST_Normal Middle 0.05691057
10 ALT_Mild_Elevation Middle 0.07154472
4 Young, AST_Normal ALT_Normal 0.08780488
14 AST_Mild_Elevation Middle 0.05040650
3 Young, ALT_Normal AST_Normal 0.08780488
8 Middle, ALT_Normal AST_Normal 0.35934959
18 Senior, AST_Normal ALT_Normal 0.26829268
2 Young ALT_Normal, AST_Normal 0.08780488
20 ALT_Normal AST_Normal 0.75121951
21 AST_Normal ALT_Normal 0.75121951
confidence lift
12 0.6034483 1.258036
10 0.5789474 1.206958
4 0.9473684 1.120445
14 0.5254237 1.095375
3 0.9152542 1.082464
8 0.9132231 1.080062
18 0.9016393 1.066362
2 0.7941176 1.057105
20 0.8884615 1.050777
21 0.8884615 1.050777
# Display plot
scatter_plot